
**********
* Readme *
**********

* This script estimates duration models for the appendix (domestic workers as employees) 


* Root folder (PATH TO BE DEFINED BY THE USER)
**********************************************
clear all
global analysis "C:/***/replication_package"


* Timestamped log
*****************
global today = strofreal(date(c(current_date), "DMY"), "%tdYYNNDD")
log using "${analysis}/code/logs/4_4_estimate_duration_models_appx_dom_${today}.smcl", replace


*******************************
* Import our general ML model *
*******************************

program drop _all
do "${analysis}/code/prog_duration_d1.do"

/* This ML d1 allows for 
- monotonic hazard (constant or not) 
- any type of censoring or truncation 
- optional 2-types mixture heterogeneity

NOTES: 
- t_0, t_a and t_b can have different names, but they need to appear in this specific order.
- Add the "missing" option to make sure the program won't ignore the cases with right censoring (ie those where t_b is missing) */

* Constraint for the exponential case
constraint 1 [shape]_cons = 1

* Force estimation without het terms
constraint 2 [ln_odds]_cons = -750
constraint 3 [ln_u]_cons = -750


************************************************************
* EMPLOYMENT duration (appendix, dom workers as employees) *
************************************************************

use "${analysis}/data/2_6_appx_dom_pnad_clean_duration_e.dta", clear
codebook ind_id


****************************************************************************
* Define covariates (same set for both employment and unemployment models) *
****************************************************************************

* Define set of covariates
unab covariates_all : rg_* ae_* at_* fp_* n_kids n_youngs n_adults n_seniors region_* // all
unab ref_groups : rg_1_nonwhite_female ae_1_1 fp_h_wp_nk region_SP_c // reference group: non white female, no education, head w partner no kids, Sao Paulo
global covariates : list covariates_all - ref_groups // final list of regressors


****************************************************************************************
* Starting point: Standard exponential model (no interval censoring, no heterogeneity) *
****************************************************************************************

* Set a transition point half-way between ta_e and tb_e
gen midpoint_e = (ta_e + tb_e)/2
replace midpoint_e = ta_e if missing(tb_e)

* Declare survival-time data with a single-failure per subject; reference interval unit is month
stset midpoint_e [fweight = fweight], failure(fail_e == 1) enter(t0_e) 

* Standard exponential model
eststo streg_e, title("Streg exponential"): streg $covariates, distribution(exponential) nohr
estimates save "${analysis}/results/estimations/4_4_appx_dom_mod_streg_e.ster", replace

* Store coefficients to use as initial values in the next step
matrix base_coeff_e = e(b)
matrix coleq base_coeff_e = scale // change the equation name from "_t" to "scale"


*****************************************************************************
* Constant hazards, all types of censoring and truncation, no heterogeneity *
*****************************************************************************

ml model d1 duration (scale: t0_e ta_e tb_e = $covariates) (shape:) (ln_u:) (ln_odds:) [fweight = fweight], ///
  missing constraint(1 2 3) technique(dfp 10 bfgs 10) ///
  diparm(ln_u, exp label("heterog") prob) ///
  diparm(ln_odds, invlogit label("share") prob)

ml init base_coeff_e
eststo ml_exp_e, title("Exponential"): ml maximize, difficult search(off)
estimates save "${analysis}/results/estimations/4_4_appx_dom_mod_ml_exp_e.ster", replace

* Store coefficients to use as initial values in the next step
matrix base_coeff_e = e(b)
matrix base_coeff_e[1, colnumb(base_coeff_e, "ln_u:_cons")] = 0    // initial value for high type baseline gap
matrix base_coeff_e[1, colnumb(base_coeff_e, "ln_odds:_cons")] = 0 // initial value for share of high type


***************************************************************************************
* Constant hazards, all types of censoring and truncation, 2-types mixture (BASELINE) *
***************************************************************************************

ml model d1 duration (scale: t0_e ta_e tb_e = $covariates) (shape:) (ln_u:) (ln_odds:) [fweight = fweight], ///
  missing constraint(1) technique(dfp 10 bfgs 10) ///
  diparm(ln_u, exp label("heterog") prob) ///
  diparm(ln_odds, invlogit label("share") prob)

ml init base_coeff_e
eststo ml_exp_mix_e, title("Exponential mixture"): ml maximize, difficult search(off)
estimates save "${analysis}/results/estimations/4_4_appx_dom_mod_ml_exp_mix_e.ster", replace


**************************************************************
* UNEMPLOYMENT duration (appendix, dom workers as employees) *
**************************************************************

* Import data
use "${analysis}/data/2_6_appx_dom_pnad_clean_duration_u.dta", clear
codebook ind_id


****************************************************************************************
* Starting point: Standard exponential model (no interval censoring, no heterogeneity) *
****************************************************************************************

* Set a transition point half way between ta_u and tb_u
gen midpoint_u = (ta_u + tb_u)/2
replace midpoint_u = ta_u if missing(tb_u)

* Declaring survival-time data with a single-failure per subjetc; reference interval unit is month
stset midpoint_u [fweight = fweight], failure(fail_u == 1) enter(t0_u) 

* Standard exponential model
eststo streg_u, title("Streg Exponential"): streg $covariates, distribution(exponential) nohr
estimates save "${analysis}/results/estimations/4_4_appx_dom_mod_streg_u.ster", replace

* Store coefficients to use as initial values in the next step
matrix base_coeff_u = e(b)
matrix coleq base_coeff_u = scale // change the equation name from "_t" to "scale"


*****************************************************************************
* Constant hazards, all types of censoring and truncation, no heterogeneity *
*****************************************************************************

ml model d1 duration (scale: t0_u ta_u tb_u = $covariates) (shape:) (ln_u:) (ln_odds:) [fweight = fweight], ///
  missing constraint(1 2 3) technique(dfp 10 bfgs 10) ///
  diparm(ln_u, exp label("heterog") prob) ///
  diparm(ln_odds, invlogit label("share") prob)

ml init base_coeff_u
eststo ml_exp_u, title("Exponential"): ml maximize, difficult search(off)
estimates save "${analysis}/results/estimations/4_4_appx_dom_mod_ml_exp_u.ster", replace

* Store coefficients to use as initial values in the next step
matrix base_coeff_u = e(b)
matrix base_coeff_u[1, colnumb(base_coeff_u, "ln_u:_cons")] = 0    // initial value for high type baseline gap
matrix base_coeff_u[1, colnumb(base_coeff_u, "ln_odds:_cons")] = 0 // initial value for share of high type


***************************************************************************************
* Constant hazards, all types of censoring and truncation, 2-types mixture (BASELINE) *
***************************************************************************************

ml model d1 duration (scale: t0_u ta_u tb_u = $covariates) (shape:) (ln_u:) (ln_odds:) [fweight = fweight], ///
  missing constraint(1) technique(dfp 10 bfgs 10) ///
  diparm(ln_u, exp label("heterog") prob) ///
  diparm(ln_odds, invlogit label("share") prob)

ml init base_coeff_u
eststo ml_exp_mix_u, title("Exponential mixture"): ml maximize, difficult search(off)
estimates save "${analysis}/results/estimations/4_4_appx_dom_mod_ml_exp_mix_u.ster", replace


***************************************************************************
* Predict out-of-employment and from-unemployment-into-employment hazards *
***************************************************************************

* In the exponential case, the hazard is simply exp(xb()) for any t 
* We weight the hazard by the share of the components in the mixture.

* Import POF
use "${analysis}/data/2_2_pof_clean.dta", clear
svyset psu_id [pweight = pweight], strata(strata_id) singleunit(centered) vce(linearized) 
keep if work_state_ibge_name == "Own-account worker"


* FULL SAMPLE: Out-of-employment hazards
****************************************

estimates use "${analysis}/results/estimations/4_4_appx_dom_mod_ml_exp_mix_e.ster"
predictnl appx_dom_haz_out_of_e_0 = (1 - invlogit(_b[ln_odds:_cons])) * exp(xb()) + invlogit(_b[ln_odds:_cons]) * exp(xb() + exp(_b[ln_u:_cons]))
svy: mean appx_dom_haz_out_of_e_0


* FULL SAMPLE: From-unemployment-into-employment hazards
********************************************************

estimates use "${analysis}/results/estimations/4_4_appx_dom_mod_ml_exp_mix_u.ster"
predictnl appx_dom_haz_from_u_to_e_0 = (1 - invlogit(_b[ln_odds:_cons])) * exp(xb()) + invlogit(_b[ln_odds:_cons]) * exp(xb() + exp(_b[ln_u:_cons]))
svy: mean appx_dom_haz_from_u_to_e_0


* Keep only the fitted values
*****************************

keep  ind_id appx_dom_haz_out_of_e_0 appx_dom_haz_from_u_to_e_0
order ind_id appx_dom_haz_out_of_e_0 appx_dom_haz_from_u_to_e_0
sort  ind_id
describe
save "${analysis}/data/4_4_appx_dom_est_hazards.dta", replace


* End of script
***************
cap log close